######################Simulations for posterior predictive p-values#############
################################The Main function###############################

library(parallel)
library(MCMCpack)

##############################set parameters####################################
##### M is the number of replications for each node #####
##### nNodes is the number of nodes used for parallel computing #####
##### (M*nNodes=3000 is the number of replications) #######
##### seed is the initial random seed used in paralle computing #####
##### N is the sample size #####
##### K is used to determine the dimension of covariates #####
##### dim(X) = 4*K for regular DGP, the 5th to 4*Kth terms are independent replications of the first 4 elements.
nNodes <- 24
M <- ceiling(3000/nNodes)
seed <- 421
N <- 1000
K <- 1


# params=list() is used to control how to generate data and do the test.
#       "N" is the sample size         ; "M" is the number of iterations; 
#       "N.1" is the target sample size. If params$theta = NA, theta will be set to the root of sum(pi(X;theta+beta.logis%*%X)) = N.1
#       Here we simply set params$theta = 0.
#       "m" is the number of iterations in randomization test; "tau" is the treatment effect;
#       "is.indiv" indicates the strong or weak null hypothesis we are using;
#       "bootm" is the number of iterations in bootstrap sd estimator;
#       "beta.reg0","mu0","sd0","sign" are parameters for outcome model of Y_0: Y_0=sign*(beta.reg0%*%(X-mu)+mu_0)+N(0,sd0^2);
#       "beta.reg1","sd1","tau","sign" are parameters for outcome model of Y_1: Y_1=sign*(beta.reg1%*%(X-mu)+mu_0+tau)+N(0,sd1^2);
#       "beta.logis", theta are parameters for logistic model: ps = 1/(1+exp(-theta-beta.logis%*%X)). 
#       (If theta = NA, theta will be set to the root of sum(pi(X;theta+beta.logis%*%X)) = N.1.)
#       "extr"(useless) was used to generate extreme PS under regular DGP. If it equals TRUE, PS will be very extreme.
#       So we finally use another DGP called extreme to generate extreme PS. This parameter is useless now.
#       "finite sample" indicates whether we use finite sample or superpopulation (not used in this simulation)
#       "direct.p" is used to indicate whether return p-value or posterior distributions
#       "Method" is used to indicate the testing method we use. 
#       Method = "ppp" means posterior predictive p-values, = "normal" means normal approximations.
#       "statistic" indicates the test statistic,
#       statistic = "all" means using all statistics with four kinds of model specification
#       statistic = "dr" means using dr estimator under four kinds of model specification
#       "known.ps" indicates whether we use estimated propensity score or not in IPW estimator
#       normalize=T indicates using studentized estimator, otherwise the unstudentized estimator is used.
#       "data" indicates the DGP, data = "regular" means following the regular DGP,
#       data = "extreme" means following the extreme DGP, data = "bmi" means using the bmi dataset;
#       data = "cps" means using the cps dataset.
#       "K" is used to determine the dimension of covariates, we set it to 1 throughout our simulations.
#       dim(X) = 4*K for regular DGP, the 5th to 4*Kth terms are independent replications of the first 4 elements.
#       "match.num" indicates that we are using 1-match.num matching for cps data.
#       "is.interation" indicates whether including interation terms of covariates of cps data.
#       "boot.normalize" indicates whether using studentization in bootstrap testing, we didn't use this function.
#       "twosided" indicates using two-sided test or not, set to TRUE throughout this simulation.

params <- list(N = N,  M = M, N.1 = N/3*2,
               m = 2e3,   tau = 0,      is.indiv = F, bootm = 0,
               beta.reg0 = c(-0.1,0.2,0.2,0.2),
               beta.reg1 = c(0.1,-0.3,-0.1,0.2),
               beta.logis = c(-1,0.5,-0.25,-0.1),
               mu.0 = 1,   sd0 = 5 , sd1 = 1, theta = 0, sign = -1,
               finite.sample = F, extr = F,   direct.p = T,
               Method = "ppp", statistic = "all", known.ps = F, 
               normalize = F, data = "regular", K = K,
               match.num = 0, is.interaction = F , boot.normalize = T,
               twosided = T)


##########################Load the other files##################################
source("estimating.R",local = TRUE)
source("testing.R",local = TRUE)
source("dta_generating.R",local = TRUE)

#########################The simulation function################################
simu.test <- function(params)
{
  N <<- params$N
  M <<- params$M
  K <<- params$K
  
##########load again (parallel computing need to load files again)##############
  source("estimating.R",local = TRUE)
  source("testing.R",local = TRUE)
  source("dta_generating.R",local = TRUE)

########if we are using the true dataset, directly calculate the p-value########
##########otherwise replicate the testing procedure for m times#################
  if(params$data == "bmi" || params$data == "cps" ){
    dta <- dta_generator(params)
    N <<- length(dta$ps)
    return(cal.pvalue(params))
  }
  else
    return(replicate(params$M,cal.pvalue(params)))
}




###############################Parallel computing###############################
para.cal <- function(func,dta_l,dta = NA,seed)
{
  paramsList = list()
  for(i in 1:nNodes){
    paramsList[[i]] = params
  }
  cpucl <- makeCluster(nNodes)
  each.seed <- function(s){
    assign(".Random.seed", s, envir = .GlobalEnv)
  }
  RNGkind("L'Ecuyer-CMRG")
  set.seed(seed)
  seed0 <- .Random.seed
  seeds <- as.list(1:nNodes)
  for(i in 1:nNodes){ ###########Make different seeds for each node#############
    seed0 <- nextRNGStream(seed0)
    seeds[[i]] <- seed0
  }
############################set seeds for each node#############################
  junk <- clusterApply(cpucl, seeds, each.seed)
  clusterEvalQ(cpucl, library(MCMCpack))
  clusterEvalQ(cpucl, library(tidyverse))
  clusterEvalQ(cpucl, library(Matching))
  if(!is.na(dta))
    p.values <- parLapply(cpucl,paramsList,func,dta = dta)
  else
    p.values <- parLapply(cpucl,paramsList,func)
  p.values <- matrix(unlist(p.values),nrow = dta_l)
  stopCluster(cpucl)
  return(p.values)
}
